#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "SSE2NEONBinding.h"
#include "SSE2NEONTEST.h"

// SSE2NEONTEST performs a set of 'unit tests' making sure that each SSE call
// provides the output we expect.  If this fires an assert, then something didn't match up.


#ifdef WIN32

#pragma warning(disable:4211)

#include <xmmintrin.h>
#include <emmintrin.h>

#else

#include "SSE2NEON.h"

#endif

namespace SSE2NEON
{

// Bit pattern of the NaN used by the tests: sign, exponent and mantissa bits all set.
const uint32_t inan = 0xffffffff;

// Returns the float whose bit pattern equals `inan`.
// The bits are copied with memcpy because reading a uint32_t object through a
// float pointer violates the strict-aliasing rule.
static inline float getNAN(void)
{
    float fn;
    memcpy(&fn, &inan, sizeof(fn));
    return fn;
}

// Returns true when `a` is any IEEE-754 single-precision NaN, i.e. its exponent
// bits are all ones and its mantissa is non-zero.  This includes the all-ones
// pattern 0xffffffff as well as every other NaN encoding.
// The bits are read with memcpy to avoid a strict-aliasing violation.
static inline bool isNAN(float a)
{
    uint32_t ia;
    memcpy(&ia, &a, sizeof(ia));
    // Drop the sign bit; anything above the infinity pattern is a NaN.
    return (ia & 0x7fffffffu) > 0x7f800000u;
}

// Rounds to the nearest integer, resolving exact ties towards the even integer,
// so that the result matches what the SSE conversion instructions produce.
// NaN is returned unchanged, and so is any value whose magnitude is at least
// 2^23: such floats have no fractional part, and converting NaN or values
// outside the int32_t range to an integer is undefined behavior.
static inline float bankersRounding(float val)
{
    if (val != val)
    {
        return val; // NaN: nothing to round
    }
    if (val < 0)
    {
        // Round the magnitude and restore the sign.
        return -bankersRounding(-val);
    }
    if (val >= 8388608.0f) // 2^23: already an integer (also covers +infinity)
    {
        return val;
    }

    const int32_t truncateInteger = int32_t(val);     // candidate obtained by rounding down
    const int32_t roundInteger = int32_t(val + 0.5f); // candidate obtained by rounding half up
    const float diff1 = val - float(truncateInteger);        // distance to the truncated value
    const float diff2 = fabsf(val - float(roundInteger));    // distance to the rounded-up value

    if (diff1 < diff2)
    {
        return float(truncateInteger); // closest to the truncated integer
    }
    if (diff2 < diff1)
    {
        return float(roundInteger);    // closest to the rounded-up integer
    }
    // Equidistant: pick whichever candidate is even.
    return (truncateInteger & 1) ? float(roundInteger) : float(truncateInteger);
}


    // Maps an InstructionTest enumerator to its printable name (the enumerator
    // name without the leading "IT_").  Returns "UNKNOWN!" when no case matches.
    // The helper macro below emits one `case` per enumerator and stringizes the
    // name, so the label and the returned text cannot drift apart.
    #define SSE2NEON_IT_NAME(name) case IT_##name: return #name
    const char *SSE2NEONTest::getInstructionTestString(InstructionTest test)
    {
        switch (test)
        {
        SSE2NEON_IT_NAME(MM_SETZERO_SI128);
        SSE2NEON_IT_NAME(MM_SETZERO_PS);
        SSE2NEON_IT_NAME(MM_SET1_PS);
        SSE2NEON_IT_NAME(MM_SET_PS1);
        SSE2NEON_IT_NAME(MM_SET_PS);
        SSE2NEON_IT_NAME(MM_SETR_PS);
        SSE2NEON_IT_NAME(MM_SET1_EPI32);
        SSE2NEON_IT_NAME(MM_SET_EPI32);
        SSE2NEON_IT_NAME(MM_STORE_PS);
        SSE2NEON_IT_NAME(MM_STOREU_PS);
        SSE2NEON_IT_NAME(MM_STORE_SI128);
        SSE2NEON_IT_NAME(MM_STORE_SS);
        SSE2NEON_IT_NAME(MM_STOREL_EPI64);
        SSE2NEON_IT_NAME(MM_LOAD1_PS);
        SSE2NEON_IT_NAME(MM_LOAD_PS);
        SSE2NEON_IT_NAME(MM_LOADU_PS);
        SSE2NEON_IT_NAME(MM_LOAD_SS);
        SSE2NEON_IT_NAME(MM_CMPNEQ_PS);
        SSE2NEON_IT_NAME(MM_ANDNOT_PS);
        SSE2NEON_IT_NAME(MM_ANDNOT_SI128);
        SSE2NEON_IT_NAME(MM_AND_SI128);
        SSE2NEON_IT_NAME(MM_AND_PS);
        SSE2NEON_IT_NAME(MM_OR_PS);
        SSE2NEON_IT_NAME(MM_XOR_PS);
        SSE2NEON_IT_NAME(MM_OR_SI128);
        SSE2NEON_IT_NAME(MM_XOR_SI128);
        SSE2NEON_IT_NAME(MM_MOVEMASK_PS);
        SSE2NEON_IT_NAME(MM_SHUFFLE_EPI32_DEFAULT);
        SSE2NEON_IT_NAME(MM_SHUFFLE_EPI32_FUNCTION);
        SSE2NEON_IT_NAME(MM_SHUFFLE_EPI32_SPLAT);
        SSE2NEON_IT_NAME(MM_SHUFFLE_EPI32_SINGLE);
        SSE2NEON_IT_NAME(MM_SHUFFLEHI_EPI16_FUNCTION);
        SSE2NEON_IT_NAME(MM_MOVEMASK_EPI8);
        SSE2NEON_IT_NAME(MM_SUB_PS);
        SSE2NEON_IT_NAME(MM_SUB_EPI32);
        SSE2NEON_IT_NAME(MM_ADD_PS);
        SSE2NEON_IT_NAME(MM_ADD_SS);
        SSE2NEON_IT_NAME(MM_ADD_EPI32);
        SSE2NEON_IT_NAME(MM_ADD_EPI16);
        SSE2NEON_IT_NAME(MM_MULLO_EPI16);
        SSE2NEON_IT_NAME(MM_MULLO_EPI32);
        SSE2NEON_IT_NAME(MM_MUL_PS);
        SSE2NEON_IT_NAME(MM_DIV_PS);
        SSE2NEON_IT_NAME(MM_DIV_SS);
        SSE2NEON_IT_NAME(MM_RCP_PS);
        SSE2NEON_IT_NAME(MM_SQRT_PS);
        SSE2NEON_IT_NAME(MM_SQRT_SS);
        SSE2NEON_IT_NAME(MM_RSQRT_PS);
        SSE2NEON_IT_NAME(MM_MAX_PS);
        SSE2NEON_IT_NAME(MM_MIN_PS);
        SSE2NEON_IT_NAME(MM_MAX_SS);
        SSE2NEON_IT_NAME(MM_MIN_SS);
        SSE2NEON_IT_NAME(MM_MIN_EPI16);
        SSE2NEON_IT_NAME(MM_MAX_EPI32);
        SSE2NEON_IT_NAME(MM_MIN_EPI32);
        SSE2NEON_IT_NAME(MM_MULHI_EPI16);
        SSE2NEON_IT_NAME(MM_HADD_PS);
        SSE2NEON_IT_NAME(MM_CMPLT_PS);
        SSE2NEON_IT_NAME(MM_CMPGT_PS);
        SSE2NEON_IT_NAME(MM_CMPGE_PS);
        SSE2NEON_IT_NAME(MM_CMPLE_PS);
        SSE2NEON_IT_NAME(MM_CMPEQ_PS);
        SSE2NEON_IT_NAME(MM_CMPLT_EPI32);
        SSE2NEON_IT_NAME(MM_CMPGT_EPI32);
        SSE2NEON_IT_NAME(MM_CMPORD_PS);
        SSE2NEON_IT_NAME(MM_COMILT_SS);
        SSE2NEON_IT_NAME(MM_COMIGT_SS);
        SSE2NEON_IT_NAME(MM_COMILE_SS);
        SSE2NEON_IT_NAME(MM_COMIGE_SS);
        SSE2NEON_IT_NAME(MM_COMIEQ_SS);
        SSE2NEON_IT_NAME(MM_COMINEQ_SS);
        SSE2NEON_IT_NAME(MM_CVTTPS_EPI32);
        SSE2NEON_IT_NAME(MM_CVTEPI32_PS);
        SSE2NEON_IT_NAME(MM_CVTPS_EPI32);
        SSE2NEON_IT_NAME(MM_CVTSI128_SI32);
        SSE2NEON_IT_NAME(MM_CVTSI32_SI128);
        SSE2NEON_IT_NAME(MM_CASTPS_SI128);
        SSE2NEON_IT_NAME(MM_CASTSI128_PS);
        SSE2NEON_IT_NAME(MM_LOAD_SI128);
        SSE2NEON_IT_NAME(MM_PACKS_EPI16);
        SSE2NEON_IT_NAME(MM_PACKUS_EPI16);
        SSE2NEON_IT_NAME(MM_PACKS_EPI32);
        SSE2NEON_IT_NAME(MM_UNPACKLO_EPI8);
        SSE2NEON_IT_NAME(MM_UNPACKLO_EPI16);
        SSE2NEON_IT_NAME(MM_UNPACKLO_EPI32);
        SSE2NEON_IT_NAME(MM_UNPACKLO_PS);
        SSE2NEON_IT_NAME(MM_UNPACKHI_PS);
        SSE2NEON_IT_NAME(MM_UNPACKHI_EPI8);
        SSE2NEON_IT_NAME(MM_UNPACKHI_EPI16);
        SSE2NEON_IT_NAME(MM_UNPACKHI_EPI32);
        SSE2NEON_IT_NAME(MM_SFENCE);
        SSE2NEON_IT_NAME(MM_STREAM_SI128);
        SSE2NEON_IT_NAME(MM_CLFLUSH);
        SSE2NEON_IT_NAME(MM_SHUFFLE_PS);
        SSE2NEON_IT_NAME(MM_CVTSS_F32);

        // added by hasindu
        SSE2NEON_IT_NAME(MM_SET1_EPI16);
        SSE2NEON_IT_NAME(MM_SET_EPI16);
        SSE2NEON_IT_NAME(MM_SLLI_EPI16);
        SSE2NEON_IT_NAME(MM_SRLI_EPI16);
        SSE2NEON_IT_NAME(MM_CMPEQ_EPI16);
        SSE2NEON_IT_NAME(MM_SET1_EPI8);
        SSE2NEON_IT_NAME(MM_ADDS_EPU8);
        SSE2NEON_IT_NAME(MM_SUBS_EPU8);
        SSE2NEON_IT_NAME(MM_MAX_EPU8);
        SSE2NEON_IT_NAME(MM_CMPEQ_EPI8);
        SSE2NEON_IT_NAME(MM_ADDS_EPI16);
        SSE2NEON_IT_NAME(MM_MAX_EPI16);
        SSE2NEON_IT_NAME(MM_SUBS_EPU16);
        SSE2NEON_IT_NAME(MM_CMPGT_EPI16);
        SSE2NEON_IT_NAME(MM_LOADU_SI128);
        SSE2NEON_IT_NAME(MM_STOREU_SI128);
        SSE2NEON_IT_NAME(MM_ADD_EPI8);
        SSE2NEON_IT_NAME(MM_CMPGT_EPI8);
        SSE2NEON_IT_NAME(MM_CMPLT_EPI8);
        SSE2NEON_IT_NAME(MM_SUB_EPI8);
        SSE2NEON_IT_NAME(MM_SETR_EPI32);
        SSE2NEON_IT_NAME(MM_MIN_EPU8);
        }

        // Reached only when `test` matched none of the cases above.
        return "UNKNOWN!";
    }
    #undef SSE2NEON_IT_NAME


// Makes the enclosing function return false as soon as condition `x` evaluates to false.
// The expansion already ends with a semicolon and is a bare `if` statement.
#define ASSERT_RETURN(x) if ( !(x) ) return false;

    // Returns a pseudo-random float built from the low 15 bits of rand(),
    // scaled by 1/32768.
    static float ranf(void)
    {
        const float scale = 1.0f / 32768.0f;
        const uint32_t bits = rand() & 0x7FFF;
        return (float)bits * scale;
    }

    // Returns ranf() scaled by (high - low) and offset by low.
    static float ranf(float low, float high)
    {
        const float span = high - low;
        return ranf() * span + low;
    }

    // Compares the four 32-bit integers stored in `a` with the expected values.
    // The expected values are listed from the highest element down to the
    // lowest: x is checked against element 3 and w against element 0.
    bool validateInt(__m128i a, int32_t x, int32_t y, int32_t z, int32_t w)
    {
        const int32_t *lanes = (const int32_t *)&a;
        return lanes[3] == x
            && lanes[2] == y
            && lanes[1] == z
            && lanes[0] == w;
    }

    // Compares the eight 16-bit integers stored in `a` with d0..d7, where d0 is
    // checked against element 0 and d7 against element 7.
    bool validateInt16(__m128i a, int16_t d0, int16_t d1, int16_t d2, int16_t d3, int16_t d4, int16_t d5, int16_t d6, int16_t d7)
    {
        const int16_t expected[8] = { d0, d1, d2, d3, d4, d5, d6, d7 };
        const int16_t *lanes = (const int16_t *)&a;
        for (int i = 0; i < 8; i++)
        {
            if (lanes[i] != expected[i])
            {
                return false;
            }
        }
        return true;
    }

    // Compares the sixteen 8-bit integers stored in `a` with d0..d15, where d0
    // is checked against element 0 and d15 against element 15.
    bool validateInt8(__m128i a, int8_t d0, int8_t d1, int8_t d2, int8_t d3, int8_t d4, int8_t d5, int8_t d6, int8_t d7, int8_t d8, int8_t d9, int8_t d10, int8_t d11, int8_t d12, int8_t d13, int8_t d14, int8_t d15)
    {
        const int8_t expected[16] = { d0, d1, d2,  d3,  d4,  d5,  d6,  d7,
                                      d8, d9, d10, d11, d12, d13, d14, d15 };
        const int8_t *lanes = (const int8_t *)&a;
        for (int i = 0; i < 16; i++)
        {
            if (lanes[i] != expected[i])
            {
                return false;
            }
        }
        return true;
    }
    
    // Returns true when `a` and `b` have identical bit patterns.
    // A binary compare is used instead of a floating-point compare so that NaNs
    // and infinities are handled as well (a NaN matches a NaN with the same
    // bits, while +0.0f and -0.0f do not match each other).
    // The bits are copied with memcpy because reading a float through a
    // uint32_t pointer violates the strict-aliasing rule.
    bool validateSingleFloatPair(float a, float b)
    {
        uint32_t ia;
        uint32_t ib;
        memcpy(&ia, &a, sizeof(ia));
        memcpy(&ib, &b, sizeof(ib));
        return ia == ib;
    }

    // Checks the four floats stored in `a` against the expected values using
    // validateSingleFloatPair.  x is checked against element 3, y against
    // element 2, z against element 1 and w against element 0.
    bool validateFloat(__m128 a, float x, float y, float z, float w)
    {
        const float *lanes = (const float *)&a;
        return validateSingleFloatPair(lanes[3], x)
            && validateSingleFloatPair(lanes[2], y)
            && validateSingleFloatPair(lanes[1], z)
            && validateSingleFloatPair(lanes[0], w);
    }

    // Checks that each float stored in `a` lies within an absolute distance of
    // less than `epsilon` from its expected value.  x is checked against
    // element 3 and w against element 0.
    bool validateFloatEpsilon(__m128 a, float x, float y, float z, float w, float epsilon)
    {
        const float *lanes = (const float *)&a;
        const float expected[4] = { w, z, y, x }; // indexed by element number
        for (int i = 3; i >= 0; i--)
        {
            const float delta = fabsf(lanes[i] - expected[i]);
            // Written as !(delta < epsilon) so a NaN difference counts as a mismatch.
            if (!(delta < epsilon))
            {
                return false;
            }
        }
        return true;
    }


    bool test_mm_setzero_si128(void)
    {
        __m128i a = _mm_setzero_si128();
        return validateInt(a, 0, 0, 0, 0);
    }

    bool test_mm_setzero_ps(void)
    {
        __m128 a = _mm_setzero_ps();
        return validateFloat(a, 0, 0, 0, 0);
    }

    bool test_mm_set1_ps(float w)
    {
        __m128 a = _mm_set1_ps(w);
        return validateFloat(a, w, w, w, w);
    }

    bool test_mm_set_ps(float x, float y, float z, float w)
    {
        __m128 a = _mm_set_ps(x, y, z, w);
        return validateFloat(a, x, y, z, w);
    }

    bool test_mm_set1_epi32(int32_t i)
    {
        __m128i a = _mm_set1_epi32(i);
        return validateInt(a, i, i, i, i);
    }

    bool testret_mm_set_epi32(int32_t x, int32_t y, int32_t z, int32_t w)
    {
        __m128i a = _mm_set_epi32(x, y, z, w);
        return validateInt(a, x, y, z, w);
    }

    // Builds a vector with _mm_set_epi32 and returns it.  validateInt is run on
    // the vector, but its result is deliberately not used: this helper only
    // hands the vector back to the caller.
    __m128i test_mm_set_epi32(int32_t x, int32_t y, int32_t z, int32_t w)
    {
        const __m128i v = _mm_set_epi32(x, y, z, w);
        (void)validateInt(v, x, y, z, w);
        return v;
    }

    bool test_mm_store_ps(float *p, float x, float y, float z, float w)
    {
        __m128 a = _mm_set_ps(x, y, z, w);
        _mm_store_ps(p, a);
        ASSERT_RETURN(p[0] == w);
        ASSERT_RETURN(p[1] == z);
        ASSERT_RETURN(p[2] == y);
        ASSERT_RETURN(p[3] == x);
        return true;
    }

    // Integer flavour of the store test: the bits of _mm_set_epi32(x, y, z, w)
    // are written through _mm_store_ps, after which p[0] must hold w and p[3]
    // must hold x.
    bool test_mm_store_ps(int32_t *p, int32_t x, int32_t y, int32_t z, int32_t w)
    {
        __m128i v = _mm_set_epi32(x, y, z, w);
        // Reinterpret the integer vector as a float vector for the store.
        _mm_store_ps((float *)p, *(const __m128 *)&v);
        return p[0] == w
            && p[1] == z
            && p[2] == y
            && p[3] == x;
    }

    bool test_mm_load1_ps(const float *p)
    {
        __m128 a = _mm_load1_ps(p);
        return validateFloat(a, p[0], p[0], p[0], p[0]);
    }

    // Loads four floats from `p` with _mm_load_ps and returns the vector.
    // validateFloat is run on the loaded vector, but its result is not used:
    // this helper only hands the vector back to the caller.
    __m128 test_mm_load_ps(const float *p)
    {
        const __m128 loaded = _mm_load_ps(p);
        (void)validateFloat(loaded, p[3], p[2], p[1], p[0]);
        return loaded;
    }

    // Integer flavour of the load helper: loads 16 bytes from `p` through
    // _mm_load_ps, reinterprets them as an integer vector and returns it.
    // validateInt is run on the vector, but its result is not used.
    __m128i test_mm_load_ps(const int32_t *p)
    {
        const __m128 loaded = _mm_load_ps((const float *)p);
        const __m128i asInt = *(const __m128i *)&loaded;
        (void)validateInt(asInt, p[3], p[2], p[1], p[0]);
        return asInt;
    }


    //r0 := ~a0 & b0
    //r1 := ~a1 & b1
    //r2 := ~a2 & b2
    //r3 := ~a3 & b3
    bool test_mm_andnot_ps(const float *_a, const float *_b)
    {
        bool r = false;

        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);
        __m128 c = _mm_andnot_ps(a, b);
        // now for the assertion...
        const uint32_t *ia = (const uint32_t *)&a;
        const uint32_t *ib = (const uint32_t *)&b;
        uint32_t r0 = ~ia[0] & ib[0];
        uint32_t r1 = ~ia[1] & ib[1];
        uint32_t r2 = ~ia[2] & ib[2];
        uint32_t r3 = ~ia[3] & ib[3];
        __m128i ret = test_mm_set_epi32(r3, r2, r1, r0);
        r = validateInt(*(const __m128i *)&c, r3, r2, r1, r0);
        if (r)
        {
            r = validateInt(ret, r3, r2, r1, r0);
        }
        return r;
    }

    bool test_mm_and_ps(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);
        __m128 c = _mm_and_ps(a, b);
        // now for the assertion...
        const uint32_t *ia = (const uint32_t *)&a;
        const uint32_t *ib = (const uint32_t *)&b;
        uint32_t r0 = ia[0] & ib[0];
        uint32_t r1 = ia[1] & ib[1];
        uint32_t r2 = ia[2] & ib[2];
        uint32_t r3 = ia[3] & ib[3];
        __m128i ret = test_mm_set_epi32(r3, r2, r1, r0);
        bool r = validateInt(*(const __m128i *)&c, r3, r2, r1, r0);
        if (r)
        {
            r = validateInt(ret, r3, r2, r1, r0);
        }
        return r;
    }

    bool test_mm_or_ps(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);
        __m128 c = _mm_or_ps(a, b);
        // now for the assertion...
        const uint32_t *ia = (const uint32_t *)&a;
        const uint32_t *ib = (const uint32_t *)&b;
        uint32_t r0 = ia[0] | ib[0];
        uint32_t r1 = ia[1] | ib[1];
        uint32_t r2 = ia[2] | ib[2];
        uint32_t r3 = ia[3] | ib[3];
        __m128i ret = test_mm_set_epi32(r3, r2, r1, r0);
        bool r = validateInt(*(const __m128i *)&c, r3, r2, r1, r0);
        if (r)
        {
            r = validateInt(ret, r3, r2, r1, r0);
        }
        return r;
    }


    // Checks _mm_andnot_si128 against a scalar reference computed per 32-bit element:
    //   r[i] := ~a[i] & b[i]   for i = 0..3
    // The integer intrinsic is called directly (instead of routing the data
    // through _mm_andnot_ps) so that this test actually exercises the
    // instruction it is named after, and the result of the second validation is
    // propagated like in the sibling and/or tests.
    bool test_mm_andnot_si128(const int32_t *_a, const int32_t *_b)
    {
        __m128i a = test_mm_load_ps(_a);
        __m128i b = test_mm_load_ps(_b);
        __m128i c = _mm_andnot_si128(a, b);
        // now for the assertion...
        const uint32_t *ia = (const uint32_t *)&a;
        const uint32_t *ib = (const uint32_t *)&b;
        uint32_t r0 = ~ia[0] & ib[0];
        uint32_t r1 = ~ia[1] & ib[1];
        uint32_t r2 = ~ia[2] & ib[2];
        uint32_t r3 = ~ia[3] & ib[3];
        __m128i ret = test_mm_set_epi32(r3, r2, r1, r0);
        bool r = validateInt(c, r3, r2, r1, r0);
        if (r)
        {
            // Also confirm that the reference vector itself round-trips.
            r = validateInt(ret, r3, r2, r1, r0);
        }
        return r;
    }

    // Checks _mm_and_si128 against a scalar reference computed per 32-bit element:
    //   r[i] := a[i] & b[i]   for i = 0..3
    // The integer intrinsic is called directly (instead of routing the data
    // through _mm_and_ps) so that this test actually exercises the instruction
    // it is named after.
    bool test_mm_and_si128(const int32_t *_a, const int32_t *_b)
    {
        __m128i a = test_mm_load_ps(_a);
        __m128i b = test_mm_load_ps(_b);
        __m128i c = _mm_and_si128(a, b);
        // now for the assertion...
        const uint32_t *ia = (const uint32_t *)&a;
        const uint32_t *ib = (const uint32_t *)&b;
        uint32_t r0 = ia[0] & ib[0];
        uint32_t r1 = ia[1] & ib[1];
        uint32_t r2 = ia[2] & ib[2];
        uint32_t r3 = ia[3] & ib[3];
        __m128i ret = test_mm_set_epi32(r3, r2, r1, r0);
        bool r = validateInt(c, r3, r2, r1, r0);
        if (r)
        {
            // Also confirm that the reference vector itself round-trips.
            r = validateInt(ret, r3, r2, r1, r0);
        }
        return r;
    }

    // Checks _mm_or_si128 against a scalar reference computed per 32-bit element:
    //   r[i] := a[i] | b[i]   for i = 0..3
    // The integer intrinsic is called directly (instead of routing the data
    // through _mm_or_ps) so that this test actually exercises the instruction
    // it is named after.
    bool test_mm_or_si128(const int32_t *_a, const int32_t *_b)
    {
        __m128i a = test_mm_load_ps(_a);
        __m128i b = test_mm_load_ps(_b);
        __m128i c = _mm_or_si128(a, b);
        // now for the assertion...
        const uint32_t *ia = (const uint32_t *)&a;
        const uint32_t *ib = (const uint32_t *)&b;
        uint32_t r0 = ia[0] | ib[0];
        uint32_t r1 = ia[1] | ib[1];
        uint32_t r2 = ia[2] | ib[2];
        uint32_t r3 = ia[3] | ib[3];
        __m128i ret = test_mm_set_epi32(r3, r2, r1, r0);
        bool r = validateInt(c, r3, r2, r1, r0);
        if (r)
        {
            // Also confirm that the reference vector itself round-trips.
            r = validateInt(ret, r3, r2, r1, r0);
        }
        return r;
    }

    // Checks _mm_movemask_ps against a scalar reference: bit i of the expected
    // mask is set when the top bit (0x80000000) of the i-th 32-bit input is set.
    bool test_mm_movemask_ps(const float *p)
    {
        const uint32_t *bits = (const uint32_t *)p;
        int expected = 0;
        for (int lane = 0; lane < 4; lane++)
        {
            if (bits[lane] & 0x80000000)
            {
                expected |= (1 << lane);
            }
        }

        __m128 a = test_mm_load_ps(p);
        const int actual = _mm_movemask_ps(a);
        return actual == expected;
    }

    // Note: NEON does not have a general-purpose shuffle command like SSE.
    // When invoking _mm_shuffle_ps, there is special code for a number of the
    // most common shuffle permutations.  This test runs many permutations,
    // including all those which have an optimized/custom implementation.
    //
    // For _MM_SHUFFLE(i3, i2, i1, i0) the expected result, listed from the
    // highest element to the lowest, is { _b[i3], _b[i2], _a[i1], _a[i0] }.
    // Every permutation is executed even after a mismatch has been seen.
    bool test_mm_shuffle_ps(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);
        __m128 ret;
        bool isValid = true;

        // The shuffle selector must be a compile-time constant, hence a macro
        // rather than a loop over a table of indices.
        #define SSE2NEON_CHECK_SHUFFLE(i3, i2, i1, i0)                              \
            do                                                                      \
            {                                                                       \
                ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));            \
                if (!validateFloat(ret, _b[i3], _b[i2], _a[i1], _a[i0]))            \
                {                                                                   \
                    isValid = false;                                                \
                }                                                                   \
            } while (0)

        SSE2NEON_CHECK_SHUFFLE(0, 1, 2, 3);
        SSE2NEON_CHECK_SHUFFLE(3, 2, 1, 0);
        SSE2NEON_CHECK_SHUFFLE(0, 0, 1, 1);
        SSE2NEON_CHECK_SHUFFLE(3, 1, 0, 2);
        SSE2NEON_CHECK_SHUFFLE(1, 0, 3, 2);
        SSE2NEON_CHECK_SHUFFLE(2, 3, 0, 1);
        SSE2NEON_CHECK_SHUFFLE(0, 0, 2, 2);
        SSE2NEON_CHECK_SHUFFLE(2, 2, 0, 0);
        SSE2NEON_CHECK_SHUFFLE(3, 2, 0, 2);
        SSE2NEON_CHECK_SHUFFLE(1, 1, 3, 3);
        SSE2NEON_CHECK_SHUFFLE(2, 0, 1, 0);
        SSE2NEON_CHECK_SHUFFLE(2, 0, 0, 1);
        SSE2NEON_CHECK_SHUFFLE(2, 0, 3, 2);

        #undef SSE2NEON_CHECK_SHUFFLE

        return isValid;
    }

    // Checks _mm_movemask_epi8 against a scalar reference: bit i of the
    // expected mask is set when the top bit (0x80) of input byte i is set.
    bool test_mm_movemask_epi8(const int32_t *_a)
    {
        __m128i a = test_mm_load_ps(_a);

        const uint8_t *bytes = (const uint8_t *)_a;
        int expected = 0;
        for (uint32_t i = 0; i < 16; i++)
        {
            if (bytes[i] & 0x80)
            {
                expected |= (1 << i);
            }
        }

        const int actual = _mm_movemask_epi8(a);
        return actual == expected;
    }

    // Checks _mm_sub_ps against element-wise scalar subtraction _a[i] - _b[i].
    bool test_mm_sub_ps(const float *_a, const float *_b)
    {
        float expected[4];
        for (int i = 0; i < 4; i++)
        {
            expected[i] = _a[i] - _b[i];
        }

        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);
        __m128 c = _mm_sub_ps(a, b);
        return validateFloat(c, expected[3], expected[2], expected[1], expected[0]);
    }

    // Checks _mm_sub_epi32 against element-wise scalar subtraction _a[i] - _b[i].
    // The reference is computed in unsigned arithmetic so that it wraps around
    // on overflow, as the vector instruction does; a signed subtraction that
    // overflows would be undefined behavior.
    bool test_mm_sub_epi32(const int32_t *_a, const int32_t *_b)
    {
        int32_t expected[4];
        for (int i = 0; i < 4; i++)
        {
            expected[i] = (int32_t)((uint32_t)_a[i] - (uint32_t)_b[i]);
        }

        __m128i a = test_mm_load_ps(_a);
        __m128i b = test_mm_load_ps(_b);

        __m128i c = _mm_sub_epi32(a, b);
        return validateInt(c, expected[3], expected[2], expected[1], expected[0]);
    }

    // Checks _mm_add_ps against element-wise scalar addition _a[i] + _b[i].
    bool test_mm_add_ps(const float *_a, const float *_b)
    {
        float expected[4];
        for (int i = 0; i < 4; i++)
        {
            expected[i] = _a[i] + _b[i];
        }

        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);
        __m128 c = _mm_add_ps(a, b);
        return validateFloat(c, expected[3], expected[2], expected[1], expected[0]);
    }

    // Checks _mm_add_epi32 against element-wise scalar addition _a[i] + _b[i].
    // The reference is computed in unsigned arithmetic so that it wraps around
    // on overflow, as the vector instruction does; a signed addition that
    // overflows would be undefined behavior.
    bool test_mm_add_epi32(const int32_t *_a, const int32_t *_b)
    {
        int32_t expected[4];
        for (int i = 0; i < 4; i++)
        {
            expected[i] = (int32_t)((uint32_t)_a[i] + (uint32_t)_b[i]);
        }

        __m128i a = test_mm_load_ps(_a);
        __m128i b = test_mm_load_ps(_b);

        __m128i c = _mm_add_epi32(a, b);
        return validateInt(c, expected[3], expected[2], expected[1], expected[0]);
    }

    // Checks _mm_mullo_epi16 against a scalar reference: each expected element
    // is the product _a[i] * _b[i] narrowed back to 16 bits.
    bool test_mm_mullo_epi16(const int16_t *_a, const int16_t *_b)
    {
        int16_t expected[8];
        for (int i = 0; i < 8; i++)
        {
            expected[i] = (int16_t)(_a[i] * _b[i]);
        }

        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_mullo_epi16(a, b);
        return validateInt16(c, expected[0], expected[1], expected[2], expected[3],
                                expected[4], expected[5], expected[6], expected[7]);
    }

    // Checks _mm_mul_ps against element-wise scalar multiplication _a[i] * _b[i].
    bool test_mm_mul_ps(const float *_a, const float *_b)
    {
        float expected[4];
        for (int i = 0; i < 4; i++)
        {
            expected[i] = _a[i] * _b[i];
        }

        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);
        __m128 c = _mm_mul_ps(a, b);
        return validateFloat(c, expected[3], expected[2], expected[1], expected[0]);
    }

    // Checks _mm_rcp_ps against the scalar reciprocal 1.0f / _a[i].  The
    // comparison is approximate: validateFloatEpsilon is called with a
    // tolerance of 300.0f.
    bool test_mm_rcp_ps(const float *_a)
    {
        float expected[4];
        for (int i = 0; i < 4; i++)
        {
            expected[i] = 1.0f / _a[i];
        }

        __m128 a = test_mm_load_ps(_a);
        __m128 c = _mm_rcp_ps(a);
        return validateFloatEpsilon(c, expected[3], expected[2], expected[1], expected[0], 300.0f);
    }

    // Checks _mm_max_ps against a scalar reference that picks _a[i] when
    // _a[i] > _b[i] and _b[i] otherwise.
    bool test_mm_max_ps(const float *_a, const float *_b)
    {
        float expected[4];
        for (int i = 0; i < 4; i++)
        {
            expected[i] = _a[i] > _b[i] ? _a[i] : _b[i];
        }

        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);
        __m128 ret = _mm_max_ps(a, b);
        return validateFloat(ret, expected[3], expected[2], expected[1], expected[0]);
    }

    // Checks _mm_min_ps against a scalar reference that picks _a[i] when
    // _a[i] < _b[i] and _b[i] otherwise.
    bool test_mm_min_ps(const float *_a, const float *_b)
    {
        float expected[4];
        for (int i = 0; i < 4; i++)
        {
            expected[i] = _a[i] < _b[i] ? _a[i] : _b[i];
        }

        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);
        __m128 ret = _mm_min_ps(a, b);
        return validateFloat(ret, expected[3], expected[2], expected[1], expected[0]);
    }

    // Checks _mm_min_epi16 against a scalar reference that picks _a[i] when
    // _a[i] < _b[i] and _b[i] otherwise, for each of the eight 16-bit elements.
    bool test_mm_min_epi16(const int16_t *_a, const int16_t *_b)
    {
        int16_t expected[8];
        for (int i = 0; i < 8; i++)
        {
            expected[i] = _a[i] < _b[i] ? _a[i] : _b[i];
        }

        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_min_epi16(a, b);
        return validateInt16(c, expected[0], expected[1], expected[2], expected[3],
                                expected[4], expected[5], expected[6], expected[7]);
    }

    // Checks _mm_mulhi_epi16 against a scalar reference: each expected element
    // is the 32-bit product of _a[i] and _b[i] shifted right by 16 bits and
    // narrowed to 16 bits.
    bool test_mm_mulhi_epi16(const int16_t *_a, const int16_t *_b)
    {
        int16_t hi[8];
        for (uint32_t lane = 0; lane < 8; lane++)
        {
            const int32_t product = (int32_t)_a[lane] * (int32_t)_b[lane];
            hi[lane] = (int16_t)(product >> 16);
        }

        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_mulhi_epi16(a, b);
        return validateInt16(c, hi[0], hi[1], hi[2], hi[3], hi[4], hi[5], hi[6], hi[7]);
    }

    bool test_mm_cmplt_ps(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);

        int32_t result[4];
        result[0] = _a[0] < _b[0] ? -1 : 0;
        result[1] = _a[1] < _b[1] ? -1 : 0;
        result[2] = _a[2] < _b[2] ? -1 : 0;
        result[3] = _a[3] < _b[3] ? -1 : 0;

        __m128 ret = _mm_cmplt_ps(a, b);
        __m128i iret = *(const __m128i *)&ret;
        return validateInt(iret, result[3], result[2], result[1], result[0]);
    }

    bool test_mm_cmpgt_ps(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);

        int32_t result[4];
        result[0] = _a[0] > _b[0] ? -1 : 0;
        result[1] = _a[1] > _b[1] ? -1 : 0;
        result[2] = _a[2] > _b[2] ? -1 : 0;
        result[3] = _a[3] > _b[3] ? -1 : 0;

        __m128 ret = _mm_cmpgt_ps(a, b);
        __m128i iret = *(const __m128i *)&ret;
        return validateInt(iret, result[3], result[2], result[1], result[0]);
    }

    bool test_mm_cmpge_ps(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);

        int32_t result[4];
        result[0] = _a[0] >= _b[0] ? -1 : 0;
        result[1] = _a[1] >= _b[1] ? -1 : 0;
        result[2] = _a[2] >= _b[2] ? -1 : 0;
        result[3] = _a[3] >= _b[3] ? -1 : 0;

        __m128 ret = _mm_cmpge_ps(a, b);
        __m128i iret = *(const __m128i *)&ret;
        return validateInt(iret, result[3], result[2], result[1], result[0]);
    }

    bool test_mm_cmple_ps(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);

        int32_t result[4];
        result[0] = _a[0] <= _b[0] ? -1 : 0;
        result[1] = _a[1] <= _b[1] ? -1 : 0;
        result[2] = _a[2] <= _b[2] ? -1 : 0;
        result[3] = _a[3] <= _b[3] ? -1 : 0;

        __m128 ret = _mm_cmple_ps(a, b);
        __m128i iret = *(const __m128i *)&ret;
        return validateInt(iret, result[3], result[2], result[1], result[0]);
    }

    bool test_mm_cmpeq_ps(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);

        int32_t result[4];
        result[0] = _a[0] == _b[0] ? -1 : 0;
        result[1] = _a[1] == _b[1] ? -1 : 0;
        result[2] = _a[2] == _b[2] ? -1 : 0;
        result[3] = _a[3] == _b[3] ? -1 : 0;

        __m128 ret = _mm_cmpeq_ps(a, b);
        __m128i iret = *(const __m128i *)&ret;
        return validateInt(iret, result[3], result[2], result[1], result[0]);
    }


    bool test_mm_cmplt_epi32(const int32_t *_a, const int32_t *_b)
    {
        __m128i a = test_mm_load_ps(_a);
        __m128i b = test_mm_load_ps(_b);

        int32_t result[4];
        result[0] = _a[0] < _b[0] ? -1 : 0;
        result[1] = _a[1] < _b[1] ? -1 : 0;
        result[2] = _a[2] < _b[2] ? -1 : 0;
        result[3] = _a[3] < _b[3] ? -1 : 0;

        __m128i iret = _mm_cmplt_epi32(a, b);
        return validateInt(iret, result[3], result[2], result[1], result[0]);
    }

    bool test_mm_cmpgt_epi32(const int32_t *_a, const int32_t *_b)
    {
        __m128i a = test_mm_load_ps(_a);
        __m128i b = test_mm_load_ps(_b);

        int32_t result[4];

        result[0] = _a[0] > _b[0] ? -1 : 0;
        result[1] = _a[1] > _b[1] ? -1 : 0;
        result[2] = _a[2] > _b[2] ? -1 : 0;
        result[3] = _a[3] > _b[3] ? -1 : 0;

        __m128i iret = _mm_cmpgt_epi32(a, b);
        return validateInt(iret, result[3], result[2], result[1], result[0]);
    }



    float compord(float a, float b)
    {
        float ret;

        bool isNANA = isNAN(a);
        bool isNANB = isNAN(b);
        if ( !isNANA &&  !isNANB)
        {
            ret = getNAN();
        }
        else
        {
            ret = 0.0f;
        }
        return ret;
    }

    bool test_mm_cmpord_ps(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);

        float result[4];

        for (uint32_t i = 0; i < 4; i++)
        {
            result[i] = compord(_a[i], _b[i]);
        }

        __m128 ret = _mm_cmpord_ps(a, b);

        return validateFloat(ret, result[3], result[2], result[1], result[0]);
    }
//********************************************
    int32_t comilt_ss(float a, float b)
    {
        int32_t ret;

        bool isNANA = isNAN(a);
        bool isNANB = isNAN(b);
        if (!isNANA && !isNANB)
        {
            ret = a < b ? 1 : 0;
        }
        else
        {
            ret = 0;        // **NOTE** The documentation on MSDN is in error!  The actual hardware returns a 0, not a 1 if either of the values is a NAN!
        }
        return ret;
    }


    bool test_mm_comilt_ss(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);


        int32_t result = comilt_ss(_a[0], _b[0]);

        int32_t ret = _mm_comilt_ss(a, b);

        return result == ret ? true : false;
    }
//********************************************

//********************************************
    int32_t comigt_ss(float a, float b)
    {
        int32_t ret;

        bool isNANA = isNAN(a);
        bool isNANB = isNAN(b);
        if (!isNANA && !isNANB)
        {
            ret = a > b ? 1 : 0;
        }
        else
        {
            ret = 0;        // **NOTE** The documentation on MSDN is in error!  The actual hardware returns a 0, not a 1 if either of the values is a NAN!
        }
        return ret;
    }

    bool test_mm_comigt_ss(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);


        int32_t result = comigt_ss(_a[0], _b[0]);

        int32_t ret = _mm_comigt_ss(a, b);

        return result == ret ? true : false;
    }
//********************************************

//********************************************
    int32_t comile_ss(float a, float b)
    {
        int32_t ret;

        bool isNANA = isNAN(a);
        bool isNANB = isNAN(b);
        if (!isNANA && !isNANB)
        {
            ret = a <= b ? 1 : 0;
        }
        else
        {
            ret = 0;        // **NOTE** The documentation on MSDN is in error!  The actual hardware returns a 0, not a 1 if either of the values is a NAN!
        }
        return ret;
    }

    bool test_mm_comile_ss(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);


        int32_t result = comile_ss(_a[0], _b[0]);

        int32_t ret = _mm_comile_ss(a, b);

        return result == ret ? true : false;
    }
//********************************************

//********************************************
    int32_t comige_ss(float a, float b)
    {
        int32_t ret;

        bool isNANA = isNAN(a);
        bool isNANB = isNAN(b);
        if (!isNANA && !isNANB)
        {
            ret = a >= b ? 1 : 0;
        }
        else
        {
            ret = 0;        // **NOTE** The documentation on MSDN is in error!  The actual hardware returns a 0, not a 1 if either of the values is a NAN!
        }
        return ret;
    }

    bool test_mm_comige_ss(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);


        int32_t result = comige_ss(_a[0], _b[0]);

        int32_t ret = _mm_comige_ss(a, b);

        return result == ret ? true : false;
    }
    //********************************************

    //********************************************
    int32_t comieq_ss(float a, float b)
    {
        int32_t ret;

        bool isNANA = isNAN(a);
        bool isNANB = isNAN(b);
        if (!isNANA && !isNANB)
        {
            ret = a == b ? 1 : 0;
        }
        else
        {
            ret = 0;        // **NOTE** The documentation on MSDN is in error!  The actual hardware returns a 0, not a 1 if either of the values is a NAN!
        }
        return ret;
    }

    bool test_mm_comieq_ss(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);

        int32_t result = comieq_ss(_a[0], _b[0]);

        int32_t ret = _mm_comieq_ss(a, b);

        return result == ret ? true : false;
    }
    //********************************************


    //********************************************
    int32_t comineq_ss(float a, float b)
    {
        int32_t ret;

        bool isNANA = isNAN(a);
        bool isNANB = isNAN(b);
        if (!isNANA && !isNANB)
        {
            ret = a != b ? 1 : 0;
        }
        else
        {
            ret = 1;
        }
        return ret;
    }

    bool test_mm_comineq_ss(const float *_a, const float *_b)
    {
        __m128 a = test_mm_load_ps(_a);
        __m128 b = test_mm_load_ps(_b);


        int32_t result = comineq_ss(_a[0], _b[0]);

        int32_t ret = _mm_comineq_ss(a, b);

        return result == ret ? true : false;
    }
    //********************************************




    bool test_mm_cvttps_epi32(const float *_a)
    {
        __m128 a = test_mm_load_ps(_a);
        int32_t trun[4];
        for (uint32_t i = 0; i < 4; i++)
        {
            trun[i] = (int32_t)_a[i];
        }

        __m128i ret = _mm_cvttps_epi32(a);
        return validateInt(ret, trun[3], trun[2], trun[1], trun[0]);
    }

    bool test_mm_cvtepi32_ps(const int32_t *_a)
    {
        __m128i a = test_mm_load_ps(_a);
        float trun[4];
        for (uint32_t i = 0; i < 4; i++)
        {
            trun[i] = (float)_a[i];
        }

        __m128 ret = _mm_cvtepi32_ps(a);
        return validateFloat(ret, trun[3], trun[2], trun[1], trun[0]);
    }

    // https://msdn.microsoft.com/en-us/library/xdc42k5e%28v=vs.90%29.aspx?f=255&MSPPError=-2147217396
    bool test_mm_cvtps_epi32(const float _a[4])
    {
        __m128 a = test_mm_load_ps(_a);
        int32_t trun[4];
        for (uint32_t i = 0; i < 4; i++)
        {
            trun[i] = (int32_t)(bankersRounding(_a[i]));
        }

        __m128i ret = _mm_cvtps_epi32(a);
        return validateInt(ret, trun[3], trun[2], trun[1], trun[0]);
    }
    
    
    //following tests added by hasindu
    
    bool test_mm_set1_epi16(const int16_t *_a)
    {
        int16_t d0 = _a[0];

        __m128i c = _mm_set1_epi16(d0);
        return validateInt16(c, d0, d0, d0, d0, d0, d0, d0, d0);            
    }   
    
    
    bool test_mm_set_epi16(const int16_t *_a)
    {
        int16_t d0 = _a[0];
        int16_t d1 = _a[1];
        int16_t d2 = _a[2];
        int16_t d3 = _a[3];
        int16_t d4 = _a[4];
        int16_t d5 = _a[5];
        int16_t d6 = _a[6];
        int16_t d7 = _a[7];   

        __m128i c = _mm_set_epi16(d7, d6, d5, d4, d3, d2, d1, d0);
        return validateInt16(c, d0, d1, d2, d3, d4, d5, d6, d7);            
    }
    
    // Checks _mm_slli_epi16 with a fixed shift count of 3.
    //
    // The expected values are computed on the unsigned 16-bit representation of
    // each input: left-shifting a negative signed value is undefined behaviour
    // before C++20, and the int16_t inputs may be negative. The shifted bits are
    // then truncated back to 16 bits.
    bool test_mm_slli_epi16(const int16_t *_a)
    {
        const int count = 3;

        int16_t expected[8];
        for (uint32_t lane = 0; lane < 8; lane++)
        {
            const uint16_t bits = static_cast<uint16_t>(_a[lane]);
            // bits is promoted to a non-negative int, so the shift is well defined;
            // the cast back to uint16_t discards the bits shifted out of the element.
            expected[lane] = static_cast<int16_t>(static_cast<uint16_t>(bits << count));
        }

        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i c = _mm_slli_epi16(a, count);

        return validateInt16(c, expected[0], expected[1], expected[2], expected[3],
                             expected[4], expected[5], expected[6], expected[7]);
    }
    
    bool test_mm_srli_epi16(const int16_t *_a)
    {
        const int count = 3;
        
        int16_t d0 = (uint16_t)(_a[0])>>count;
        int16_t d1 = (uint16_t)(_a[1])>>count;
        int16_t d2 = (uint16_t)(_a[2])>>count;
        int16_t d3 = (uint16_t)(_a[3])>>count;
        int16_t d4 = (uint16_t)(_a[4])>>count;
        int16_t d5 = (uint16_t)(_a[5])>>count;
        int16_t d6 = (uint16_t)(_a[6])>>count;
        int16_t d7 = (uint16_t)(_a[7])>>count;   

        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i c = _mm_srli_epi16(a,count);
        return validateInt16(c, d0, d1, d2, d3, d4, d5, d6, d7);                   
    } 
    
    bool test_mm_cmpeq_epi16(const int16_t *_a, const int16_t *_b)
    {
        int16_t d0 = (_a[0]==_b[0]) ? 0xffff : 0x0;
        int16_t d1 = (_a[1]==_b[1]) ? 0xffff : 0x0;
        int16_t d2 = (_a[2]==_b[2]) ? 0xffff : 0x0;
        int16_t d3 = (_a[3]==_b[3]) ? 0xffff : 0x0;;
        int16_t d4 = (_a[4]==_b[4]) ? 0xffff : 0x0;;
        int16_t d5 = (_a[5]==_b[5]) ? 0xffff : 0x0;
        int16_t d6 = (_a[6]==_b[6]) ? 0xffff : 0x0;;
        int16_t d7 = (_a[7]==_b[7]) ? 0xffff : 0x0;   

        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_cmpeq_epi16(a,b);
        return validateInt16(c, d0, d1, d2, d3, d4, d5, d6, d7);            
    }     

    bool test_mm_set1_epi8(const int8_t *_a)
    {
        int8_t d0 = _a[0];
        __m128i c = _mm_set1_epi8(d0);
        return validateInt8(c, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0);
    }

    bool test_mm_adds_epu8(const int8_t *_a, const int8_t *_b)
    {
        uint8_t d0  = (uint8_t)_a[0]  + (uint8_t)_b[0] ;   if(d0  < (uint8_t)_a[0] ) d0  = 255;
        uint8_t d1  = (uint8_t)_a[1]  + (uint8_t)_b[1] ;   if(d1  < (uint8_t)_a[1] ) d1  = 255;
        uint8_t d2  = (uint8_t)_a[2]  + (uint8_t)_b[2] ;   if(d2  < (uint8_t)_a[2] ) d2  = 255;
        uint8_t d3  = (uint8_t)_a[3]  + (uint8_t)_b[3] ;   if(d3  < (uint8_t)_a[3] ) d3  = 255;
        uint8_t d4  = (uint8_t)_a[4]  + (uint8_t)_b[4] ;   if(d4  < (uint8_t)_a[4] ) d4  = 255;
        uint8_t d5  = (uint8_t)_a[5]  + (uint8_t)_b[5] ;   if(d5  < (uint8_t)_a[5] ) d5  = 255;
        uint8_t d6  = (uint8_t)_a[6]  + (uint8_t)_b[6] ;   if(d6  < (uint8_t)_a[6] ) d6  = 255;
        uint8_t d7  = (uint8_t)_a[7]  + (uint8_t)_b[7] ;   if(d7  < (uint8_t)_a[7] ) d7  = 255;
        uint8_t d8  = (uint8_t)_a[8]  + (uint8_t)_b[8] ;   if(d8  < (uint8_t)_a[8] ) d8  = 255;
        uint8_t d9  = (uint8_t)_a[9]  + (uint8_t)_b[9] ;   if(d9  < (uint8_t)_a[9] ) d9  = 255;
        uint8_t d10 = (uint8_t)_a[10] + (uint8_t)_b[10] ;  if(d10 < (uint8_t)_a[10]) d10 = 255;
        uint8_t d11 = (uint8_t)_a[11] + (uint8_t)_b[11] ;  if(d11 < (uint8_t)_a[11]) d11 = 255;
        uint8_t d12 = (uint8_t)_a[12] + (uint8_t)_b[12] ;  if(d12 < (uint8_t)_a[12]) d12 = 255;
        uint8_t d13 = (uint8_t)_a[13] + (uint8_t)_b[13] ;  if(d13 < (uint8_t)_a[13]) d13 = 255;
        uint8_t d14 = (uint8_t)_a[14] + (uint8_t)_b[14] ;  if(d14 < (uint8_t)_a[14]) d14 = 255;
        uint8_t d15 = (uint8_t)_a[15] + (uint8_t)_b[15] ;  if(d15 < (uint8_t)_a[15]) d15 = 255;
        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_adds_epu8(a,b);
        return validateInt8(c, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15);
    }

    bool test_mm_subs_epu8(const int8_t *_a, const int8_t *_b)
    {
        uint8_t d0  = (uint8_t)_a[0]  - (uint8_t)_b[0] ;   if(d0  > (uint8_t)_a[0] ) d0  = 0;
        uint8_t d1  = (uint8_t)_a[1]  - (uint8_t)_b[1] ;   if(d1  > (uint8_t)_a[1] ) d1  = 0;
        uint8_t d2  = (uint8_t)_a[2]  - (uint8_t)_b[2] ;   if(d2  > (uint8_t)_a[2] ) d2  = 0;
        uint8_t d3  = (uint8_t)_a[3]  - (uint8_t)_b[3] ;   if(d3  > (uint8_t)_a[3] ) d3  = 0;
        uint8_t d4  = (uint8_t)_a[4]  - (uint8_t)_b[4] ;   if(d4  > (uint8_t)_a[4] ) d4  = 0;
        uint8_t d5  = (uint8_t)_a[5]  - (uint8_t)_b[5] ;   if(d5  > (uint8_t)_a[5] ) d5  = 0;
        uint8_t d6  = (uint8_t)_a[6]  - (uint8_t)_b[6] ;   if(d6  > (uint8_t)_a[6] ) d6  = 0;
        uint8_t d7  = (uint8_t)_a[7]  - (uint8_t)_b[7] ;   if(d7  > (uint8_t)_a[7] ) d7  = 0;
        uint8_t d8  = (uint8_t)_a[8]  - (uint8_t)_b[8] ;   if(d8  > (uint8_t)_a[8] ) d8  = 0;
        uint8_t d9  = (uint8_t)_a[9]  - (uint8_t)_b[9] ;   if(d9  > (uint8_t)_a[9] ) d9  = 0;
        uint8_t d10 = (uint8_t)_a[10] - (uint8_t)_b[10] ;  if(d10 > (uint8_t)_a[10]) d10 = 0;
        uint8_t d11 = (uint8_t)_a[11] - (uint8_t)_b[11] ;  if(d11 > (uint8_t)_a[11]) d11 = 0;
        uint8_t d12 = (uint8_t)_a[12] - (uint8_t)_b[12] ;  if(d12 > (uint8_t)_a[12]) d12 = 0;
        uint8_t d13 = (uint8_t)_a[13] - (uint8_t)_b[13] ;  if(d13 > (uint8_t)_a[13]) d13 = 0;
        uint8_t d14 = (uint8_t)_a[14] - (uint8_t)_b[14] ;  if(d14 > (uint8_t)_a[14]) d14 = 0;
        uint8_t d15 = (uint8_t)_a[15] - (uint8_t)_b[15] ;  if(d15 > (uint8_t)_a[15]) d15 = 0;
        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_subs_epu8(a,b);
        return validateInt8(c, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15);
    }

    bool test_mm_max_epu8(const int8_t *_a, const int8_t *_b)
    {
        uint8_t d0  = ((uint8_t)_a[0]  > (uint8_t)_b[0] ) ? ((uint8_t)_a[0] ) : ((uint8_t)_b[0] );
        uint8_t d1  = ((uint8_t)_a[1]  > (uint8_t)_b[1] ) ? ((uint8_t)_a[1] ) : ((uint8_t)_b[1] );
        uint8_t d2  = ((uint8_t)_a[2]  > (uint8_t)_b[2] ) ? ((uint8_t)_a[2] ) : ((uint8_t)_b[2] );
        uint8_t d3  = ((uint8_t)_a[3]  > (uint8_t)_b[3] ) ? ((uint8_t)_a[3] ) : ((uint8_t)_b[3] );
        uint8_t d4  = ((uint8_t)_a[4]  > (uint8_t)_b[4] ) ? ((uint8_t)_a[4] ) : ((uint8_t)_b[4] );
        uint8_t d5  = ((uint8_t)_a[5]  > (uint8_t)_b[5] ) ? ((uint8_t)_a[5] ) : ((uint8_t)_b[5] );
        uint8_t d6  = ((uint8_t)_a[6]  > (uint8_t)_b[6] ) ? ((uint8_t)_a[6] ) : ((uint8_t)_b[6] );
        uint8_t d7  = ((uint8_t)_a[7]  > (uint8_t)_b[7] ) ? ((uint8_t)_a[7] ) : ((uint8_t)_b[7] );
        uint8_t d8  = ((uint8_t)_a[8]  > (uint8_t)_b[8] ) ? ((uint8_t)_a[8] ) : ((uint8_t)_b[8] );
        uint8_t d9  = ((uint8_t)_a[9]  > (uint8_t)_b[9] ) ? ((uint8_t)_a[9] ) : ((uint8_t)_b[9] );
        uint8_t d10 = ((uint8_t)_a[10] > (uint8_t)_b[10]) ? ((uint8_t)_a[10]) : ((uint8_t)_b[10]);
        uint8_t d11 = ((uint8_t)_a[11] > (uint8_t)_b[11]) ? ((uint8_t)_a[11]) : ((uint8_t)_b[11]);
        uint8_t d12 = ((uint8_t)_a[12] > (uint8_t)_b[12]) ? ((uint8_t)_a[12]) : ((uint8_t)_b[12]);
        uint8_t d13 = ((uint8_t)_a[13] > (uint8_t)_b[13]) ? ((uint8_t)_a[13]) : ((uint8_t)_b[13]);
        uint8_t d14 = ((uint8_t)_a[14] > (uint8_t)_b[14]) ? ((uint8_t)_a[14]) : ((uint8_t)_b[14]);
        uint8_t d15 = ((uint8_t)_a[15] > (uint8_t)_b[15]) ? ((uint8_t)_a[15]) : ((uint8_t)_b[15]);
        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_max_epu8(a,b);
        return validateInt8(c, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15);
    }

    bool test_mm_cmpeq_epi8(const int8_t *_a, const int8_t *_b)
    {
        int8_t d0  = (_a[0]  == _b[0] ) ? 0xff : 0x00 ;
        int8_t d1  = (_a[1]  == _b[1] ) ? 0xff : 0x00 ;
        int8_t d2  = (_a[2]  == _b[2] ) ? 0xff : 0x00 ;
        int8_t d3  = (_a[3]  == _b[3] ) ? 0xff : 0x00 ;
        int8_t d4  = (_a[4]  == _b[4] ) ? 0xff : 0x00 ;
        int8_t d5  = (_a[5]  == _b[5] ) ? 0xff : 0x00 ;
        int8_t d6  = (_a[6]  == _b[6] ) ? 0xff : 0x00 ;
        int8_t d7  = (_a[7]  == _b[7] ) ? 0xff : 0x00 ;
        int8_t d8  = (_a[8]  == _b[8] ) ? 0xff : 0x00 ;
        int8_t d9  = (_a[9]  == _b[9] ) ? 0xff : 0x00 ;
        int8_t d10 = (_a[10] == _b[10]) ? 0xff : 0x00 ;
        int8_t d11 = (_a[11] == _b[11]) ? 0xff : 0x00 ;
        int8_t d12 = (_a[12] == _b[12]) ? 0xff : 0x00 ;
        int8_t d13 = (_a[13] == _b[13]) ? 0xff : 0x00 ;
        int8_t d14 = (_a[14] == _b[14]) ? 0xff : 0x00 ;
        int8_t d15 = (_a[15] == _b[15]) ? 0xff : 0x00 ;
        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_cmpeq_epi8(a,b);
        return validateInt8(c, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15);
    }

    bool test_mm_adds_epi16(const int16_t *_a, const int16_t *_b)
    {
        int32_t d0 = (int32_t)_a[0] + (int32_t)_b[0]; if(d0 > 32767) d0=32767; if(d0 < -32768) d0=-32768;
        int32_t d1 = (int32_t)_a[1] + (int32_t)_b[1]; if(d1 > 32767) d1=32767; if(d1 < -32768) d1=-32768;
        int32_t d2 = (int32_t)_a[2] + (int32_t)_b[2]; if(d2 > 32767) d2=32767; if(d2 < -32768) d2=-32768;
        int32_t d3 = (int32_t)_a[3] + (int32_t)_b[3]; if(d3 > 32767) d3=32767; if(d3 < -32768) d3=-32768;
        int32_t d4 = (int32_t)_a[4] + (int32_t)_b[4]; if(d4 > 32767) d4=32767; if(d4 < -32768) d4=-32768;
        int32_t d5 = (int32_t)_a[5] + (int32_t)_b[5]; if(d5 > 32767) d5=32767; if(d5 < -32768) d5=-32768;
        int32_t d6 = (int32_t)_a[6] + (int32_t)_b[6]; if(d6 > 32767) d6=32767; if(d6 < -32768) d6=-32768;
        int32_t d7 = (int32_t)_a[7] + (int32_t)_b[7]; if(d7 > 32767) d7=32767; if(d7 < -32768) d7=-32768;

        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);

        __m128i c = _mm_adds_epi16(a, b);
        return validateInt16(c, (int16_t)d0, (int16_t)d1, (int16_t)d2, (int16_t)d3, (int16_t)d4, (int16_t)d5, (int16_t)d6, (int16_t)d7);
    }

    bool test_mm_max_epi16(const int16_t *_a, const int16_t *_b)
    {
        int16_t d0 = _a[0] > _b[0] ? _a[0] : _b[0];
        int16_t d1 = _a[1] > _b[1] ? _a[1] : _b[1];
        int16_t d2 = _a[2] > _b[2] ? _a[2] : _b[2];
        int16_t d3 = _a[3] > _b[3] ? _a[3] : _b[3];
        int16_t d4 = _a[4] > _b[4] ? _a[4] : _b[4];
        int16_t d5 = _a[5] > _b[5] ? _a[5] : _b[5];
        int16_t d6 = _a[6] > _b[6] ? _a[6] : _b[6];
        int16_t d7 = _a[7] > _b[7] ? _a[7] : _b[7];

        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);

        __m128i c = _mm_max_epi16(a, b);
        return validateInt16(c, d0, d1, d2, d3, d4, d5, d6, d7);
    }

    bool test_mm_subs_epu16(const int16_t *_a, const int16_t *_b)
    {
        uint16_t d0  = (uint16_t)_a[0]  - (uint16_t)_b[0] ;   if(d0  > (uint16_t)_a[0] ) d0  = 0;
        uint16_t d1  = (uint16_t)_a[1]  - (uint16_t)_b[1] ;   if(d1  > (uint16_t)_a[1] ) d1  = 0;
        uint16_t d2  = (uint16_t)_a[2]  - (uint16_t)_b[2] ;   if(d2  > (uint16_t)_a[2] ) d2  = 0;
        uint16_t d3  = (uint16_t)_a[3]  - (uint16_t)_b[3] ;   if(d3  > (uint16_t)_a[3] ) d3  = 0;
        uint16_t d4  = (uint16_t)_a[4]  - (uint16_t)_b[4] ;   if(d4  > (uint16_t)_a[4] ) d4  = 0;
        uint16_t d5  = (uint16_t)_a[5]  - (uint16_t)_b[5] ;   if(d5  > (uint16_t)_a[5] ) d5  = 0;
        uint16_t d6  = (uint16_t)_a[6]  - (uint16_t)_b[6] ;   if(d6  > (uint16_t)_a[6] ) d6  = 0;
        uint16_t d7  = (uint16_t)_a[7]  - (uint16_t)_b[7] ;   if(d7  > (uint16_t)_a[7] ) d7  = 0;

        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);

        __m128i c = _mm_subs_epu16(a, b);
        return validateInt16(c, d0, d1, d2, d3, d4, d5, d6, d7);
    }

    bool test_mm_cmpgt_epi16(const int16_t *_a, const int16_t *_b)
    {
        uint16_t d0 = _a[0] > _b[0] ? 0xffff : 0;
        uint16_t d1 = _a[1] > _b[1] ? 0xffff : 0;
        uint16_t d2 = _a[2] > _b[2] ? 0xffff : 0;
        uint16_t d3 = _a[3] > _b[3] ? 0xffff : 0;
        uint16_t d4 = _a[4] > _b[4] ? 0xffff : 0;
        uint16_t d5 = _a[5] > _b[5] ? 0xffff : 0;
        uint16_t d6 = _a[6] > _b[6] ? 0xffff : 0;
        uint16_t d7 = _a[7] > _b[7] ? 0xffff : 0;

        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);

        __m128i c = _mm_cmpgt_epi16(a, b);
        return validateInt16(c, d0, d1, d2, d3, d4, d5, d6, d7);
    }

    bool test_mm_loadu_si128(const int32_t *_a)
    {
        __m128i c = _mm_loadu_si128((const __m128i*)_a);
        return validateInt(c, _a[3], _a[2], _a[1], _a[0]);
        
    }

    bool test_mm_storeu_si128(const int32_t *_a)
    {
       __m128i b; 
       __m128i a = _mm_loadu_si128((const __m128i*)_a); 
       _mm_storeu_si128(&b,a);
       int32_t *_b = (int32_t *)&b;
       return validateInt(a, _b[3], _b[2], _b[1], _b[0]);
       return 1;
    }

    bool test_mm_add_epi8(const int8_t *_a, const int8_t *_b)
    {
        int8_t d0 = _a[0] + _b[0] ;
        int8_t d1 = _a[1] + _b[1] ; 
        int8_t d2 = _a[2] + _b[2] ;
        int8_t d3 = _a[3] + _b[3] ;
        int8_t d4 = _a[4] + _b[4] ;
        int8_t d5 = _a[5] + _b[5] ;
        int8_t d6 = _a[6] + _b[6] ;
        int8_t d7 = _a[7] + _b[7] ;
        int8_t d8 = _a[8] + _b[8] ;
        int8_t d9 = _a[9] + _b[9] ;
        int8_t d10 = _a[10] + _b[10] ;
        int8_t d11 = _a[11] + _b[11] ;
        int8_t d12 = _a[12] + _b[12] ;
        int8_t d13 = _a[13] + _b[13] ;
        int8_t d14 = _a[14] + _b[14] ;
        int8_t d15 = _a[15] + _b[15] ;
        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_add_epi8(a,b);
        return validateInt8(c, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15);
    }

    bool test_mm_cmpgt_epi8(const int8_t *_a, const int8_t *_b)
    {
        int8_t d0  = (_a[0]  > _b[0] ) ? 0xff : 0x00 ;
        int8_t d1  = (_a[1]  > _b[1] ) ? 0xff : 0x00 ;
        int8_t d2  = (_a[2]  > _b[2] ) ? 0xff : 0x00 ;
        int8_t d3  = (_a[3]  > _b[3] ) ? 0xff : 0x00 ;
        int8_t d4  = (_a[4]  > _b[4] ) ? 0xff : 0x00 ;
        int8_t d5  = (_a[5]  > _b[5] ) ? 0xff : 0x00 ;
        int8_t d6  = (_a[6]  > _b[6] ) ? 0xff : 0x00 ;
        int8_t d7  = (_a[7]  > _b[7] ) ? 0xff : 0x00 ;
        int8_t d8  = (_a[8]  > _b[8] ) ? 0xff : 0x00 ;
        int8_t d9  = (_a[9]  > _b[9] ) ? 0xff : 0x00 ;
        int8_t d10 = (_a[10] > _b[10]) ? 0xff : 0x00 ;
        int8_t d11 = (_a[11] > _b[11]) ? 0xff : 0x00 ;
        int8_t d12 = (_a[12] > _b[12]) ? 0xff : 0x00 ;
        int8_t d13 = (_a[13] > _b[13]) ? 0xff : 0x00 ;
        int8_t d14 = (_a[14] > _b[14]) ? 0xff : 0x00 ;
        int8_t d15 = (_a[15] > _b[15]) ? 0xff : 0x00 ;
        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_cmpgt_epi8(a,b);
        return validateInt8(c, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15);
    }

    bool test_mm_cmplt_epi8(const int8_t *_a, const int8_t *_b)
    {
        int8_t d0  = (_a[0]  < _b[0] ) ? 0xff : 0x00 ;
        int8_t d1  = (_a[1]  < _b[1] ) ? 0xff : 0x00 ;
        int8_t d2  = (_a[2]  < _b[2] ) ? 0xff : 0x00 ;
        int8_t d3  = (_a[3]  < _b[3] ) ? 0xff : 0x00 ;
        int8_t d4  = (_a[4]  < _b[4] ) ? 0xff : 0x00 ;
        int8_t d5  = (_a[5]  < _b[5] ) ? 0xff : 0x00 ;
        int8_t d6  = (_a[6]  < _b[6] ) ? 0xff : 0x00 ;
        int8_t d7  = (_a[7]  < _b[7] ) ? 0xff : 0x00 ;
        int8_t d8  = (_a[8]  < _b[8] ) ? 0xff : 0x00 ;
        int8_t d9  = (_a[9]  < _b[9] ) ? 0xff : 0x00 ;
        int8_t d10 = (_a[10] < _b[10]) ? 0xff : 0x00 ;
        int8_t d11 = (_a[11] < _b[11]) ? 0xff : 0x00 ;
        int8_t d12 = (_a[12] < _b[12]) ? 0xff : 0x00 ;
        int8_t d13 = (_a[13] < _b[13]) ? 0xff : 0x00 ;
        int8_t d14 = (_a[14] < _b[14]) ? 0xff : 0x00 ;
        int8_t d15 = (_a[15] < _b[15]) ? 0xff : 0x00 ;
        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_cmplt_epi8(a,b);
        return validateInt8(c, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15);
    }

    bool test_mm_sub_epi8(const int8_t *_a, const int8_t *_b)
    {
        int8_t d0 = _a[0]   - _b[0] ;
        int8_t d1 = _a[1]   - _b[1] ;
        int8_t d2 = _a[2]   - _b[2] ;
        int8_t d3 = _a[3]   - _b[3] ;
        int8_t d4 = _a[4]   - _b[4] ;
        int8_t d5 = _a[5]   - _b[5] ;
        int8_t d6 = _a[6]   - _b[6] ;
        int8_t d7 = _a[7]   - _b[7] ;
        int8_t d8 = _a[8]   - _b[8] ;
        int8_t d9 = _a[9]   - _b[9] ;
        int8_t d10 = _a[10] - _b[10] ;
        int8_t d11 = _a[11] - _b[11] ;
        int8_t d12 = _a[12] - _b[12] ;
        int8_t d13 = _a[13] - _b[13] ;
        int8_t d14 = _a[14] - _b[14] ;
        int8_t d15 = _a[15] - _b[15] ;
        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_sub_epi8(a,b);
        return validateInt8(c, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15);
    }

    bool test_mm_setr_epi32(const int32_t *_a)
    {   
        __m128i c = _mm_setr_epi32(_a[0], _a[1], _a[2], _a[3]);
        return validateInt(c, _a[3], _a[2], _a[1], _a[0]);        
    }

    bool test_mm_min_epu8(const int8_t *_a, const int8_t *_b)
    {
        uint8_t d0  = ((uint8_t)_a[0]  < (uint8_t)_b[0] ) ? (uint8_t)_a[0] : (uint8_t)_b[0]  ;
        uint8_t d1  = ((uint8_t)_a[1]  < (uint8_t)_b[1] ) ? (uint8_t)_a[1] : (uint8_t)_b[1]  ;
        uint8_t d2  = ((uint8_t)_a[2]  < (uint8_t)_b[2] ) ? (uint8_t)_a[2] : (uint8_t)_b[2]  ;
        uint8_t d3  = ((uint8_t)_a[3]  < (uint8_t)_b[3] ) ? (uint8_t)_a[3] : (uint8_t)_b[3]  ;
        uint8_t d4  = ((uint8_t)_a[4]  < (uint8_t)_b[4] ) ? (uint8_t)_a[4] : (uint8_t)_b[4]  ;
        uint8_t d5  = ((uint8_t)_a[5]  < (uint8_t)_b[5] ) ? (uint8_t)_a[5] : (uint8_t)_b[5]  ;
        uint8_t d6  = ((uint8_t)_a[6]  < (uint8_t)_b[6] ) ? (uint8_t)_a[6] : (uint8_t)_b[6]  ;
        uint8_t d7  = ((uint8_t)_a[7]  < (uint8_t)_b[7] ) ? (uint8_t)_a[7] : (uint8_t)_b[7]  ;
        uint8_t d8  = ((uint8_t)_a[8]  < (uint8_t)_b[8] ) ? (uint8_t)_a[8] : (uint8_t)_b[8]  ;
        uint8_t d9  = ((uint8_t)_a[9]  < (uint8_t)_b[9] ) ? (uint8_t)_a[9] : (uint8_t)_b[9]  ;
        uint8_t d10 = ((uint8_t)_a[10] < (uint8_t)_b[10]) ? (uint8_t)_a[10]: (uint8_t)_b[10] ;
        uint8_t d11 = ((uint8_t)_a[11] < (uint8_t)_b[11]) ? (uint8_t)_a[11]: (uint8_t)_b[11] ;
        uint8_t d12 = ((uint8_t)_a[12] < (uint8_t)_b[12]) ? (uint8_t)_a[12]: (uint8_t)_b[12] ;
        uint8_t d13 = ((uint8_t)_a[13] < (uint8_t)_b[13]) ? (uint8_t)_a[13]: (uint8_t)_b[13] ;
        uint8_t d14 = ((uint8_t)_a[14] < (uint8_t)_b[14]) ? (uint8_t)_a[14]: (uint8_t)_b[14] ;
        uint8_t d15 = ((uint8_t)_a[15] < (uint8_t)_b[15]) ? (uint8_t)_a[15]: (uint8_t)_b[15] ;
        
        __m128i a = test_mm_load_ps((const int32_t *)_a);
        __m128i b = test_mm_load_ps((const int32_t *)_b);
        __m128i c = _mm_min_epu8(a,b);
        return validateInt8(c, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15);
    }
        
    
// Try 10,000 random floating point values for each test we run
#define MAX_TEST_VALUE 10000


class SSE2NEONTestImpl : public SSE2NEONTest
{
public:
    // Allocates the four 128-bit scratch buffers and fills the float / int input pools.
    SSE2NEONTestImpl(void)
    {
        // Each scratch buffer is sized to hold exactly one SSE register.
        mTestFloatPointer1 = (float *)platformAlignedAlloc(sizeof(__m128));
        mTestFloatPointer2 = (float *)platformAlignedAlloc(sizeof(__m128));
        mTestIntPointer1 = (int32_t *)platformAlignedAlloc(sizeof(__m128i));
        mTestIntPointer2 = (int32_t *)platformAlignedAlloc(sizeof(__m128i));

        // Constant seed; presumably ranf() draws from rand(), which would make the pools
        // identical from run to run -- confirm against ranf's definition.
        srand(0);

        uint32_t slot = 0;
        while (slot < MAX_TEST_VALUE)
        {
            // Float first, then int: the order of the two ranf() calls is kept as-is.
            mTestFloats[slot] = ranf(-100000, 100000);
            mTestInts[slot] = int32_t(ranf(-100000, 100000));
            ++slot;
        }
    }

    // Releases the four scratch buffers obtained from platformAlignedAlloc in the constructor.
    virtual ~SSE2NEONTestImpl(void)
    {
        // Same buffers, same order as the allocations.
        void *const scratch[] =
        {
            mTestFloatPointer1,
            mTestFloatPointer2,
            mTestIntPointer1,
            mTestIntPointer2
        };
        for (uint32_t k = 0; k < sizeof(scratch) / sizeof(scratch[0]); ++k)
        {
            platformAlignedFree(scratch[k]);
        }
    }

    // Fills both float scratch buffers from the float pool: entries i..i+3 go to buffer 1
    // and entries i+4..i+7 go to buffer 2, each through test_mm_store_ps.
    // Returns false as soon as one of the two stores reports a failure.
    bool loadTestFloatPointers(uint32_t i)
    {
        float *window = &mTestFloats[i];

        if (!test_mm_store_ps(mTestFloatPointer1, window[0], window[1], window[2], window[3]))
        {
            return false; // the second buffer is left untouched, as before
        }
        return test_mm_store_ps(mTestFloatPointer2, window[4], window[5], window[6], window[7]);
    }

    // Fills both integer scratch buffers from the int pool: entries i..i+3 go to buffer 1
    // and entries i+4..i+7 go to buffer 2, each through test_mm_store_ps.
    // Returns false as soon as one of the two stores reports a failure.
    bool loadTestIntPointers(uint32_t i)
    {
        int32_t *window = &mTestInts[i];

        if (!test_mm_store_ps(mTestIntPointer1, window[0], window[1], window[2], window[3]))
        {
            return false; // the second buffer is left untouched, as before
        }
        return test_mm_store_ps(mTestIntPointer2, window[4], window[5], window[6], window[7]);
    }

    // Dispatches one instruction test against the current contents of the scratch buffers.
    //   test : which intrinsic to exercise
    //   i    : window start into mTestFloats / mTestInts for the tests that read the pools directly
    // Returns the result of the selected test_* helper. Instructions that have no helper wired
    // up yet, and any enum value without a case label, report true.
    bool runSingleTest(InstructionTest test,uint32_t i)
    {
        // Short names for the scratch buffers and for their reinterpreted views.
        float *const f1 = mTestFloatPointer1;
        float *const f2 = mTestFloatPointer2;
        int32_t *const i1 = mTestIntPointer1;
        int32_t *const i2 = mTestIntPointer2;
        const int32_t *const ci1 = (const int32_t *)mTestIntPointer1;
        const int16_t *const s1 = (const int16_t *)mTestIntPointer1;
        const int16_t *const s2 = (const int16_t *)mTestIntPointer2;
        const int8_t *const b1 = (const int8_t *)mTestIntPointer1;
        const int8_t *const b2 = (const int8_t *)mTestIntPointer2;

        switch ( test )
        {
            // ---- set / store / load -------------------------------------------------
            case IT_MM_SETZERO_SI128:
                return test_mm_setzero_si128();
            case IT_MM_SETZERO_PS:
                return test_mm_setzero_ps();
            case IT_MM_SET1_PS:
            case IT_MM_SET_PS1: // both enum values are routed to the same helper
                return test_mm_set1_ps(mTestFloats[i]);
            case IT_MM_SET_PS:
                return test_mm_set_ps(mTestFloats[i], mTestFloats[i + 1], mTestFloats[i + 2], mTestFloats[i + 3]);
            case IT_MM_SET1_EPI32:
                return test_mm_set1_epi32(mTestInts[i]);
            case IT_MM_SET_EPI32:
                return testret_mm_set_epi32(mTestInts[i], mTestInts[i + 1], mTestInts[i + 2], mTestInts[i + 3]);
            case IT_MM_STORE_PS:
                return test_mm_store_ps(i1, mTestInts[i], mTestInts[i + 1], mTestInts[i + 2], mTestInts[i + 3]);
            case IT_MM_LOAD1_PS:
                return test_mm_load1_ps(f1);

            // ---- bitwise ------------------------------------------------------------
            case IT_MM_ANDNOT_PS:
                return test_mm_andnot_ps(f1, f2);
            case IT_MM_ANDNOT_SI128:
                return test_mm_andnot_si128(i1, i2);
            case IT_MM_AND_SI128:
                return test_mm_and_si128(i1, i2);
            case IT_MM_AND_PS:
                return test_mm_and_ps(f1, f2);
            case IT_MM_OR_PS:
                return test_mm_or_ps(f1, f2);
            case IT_MM_OR_SI128:
                return test_mm_or_si128(i1, i2);

            // ---- masks / shuffles ---------------------------------------------------
            case IT_MM_MOVEMASK_PS:
                return test_mm_movemask_ps(f1);
            case IT_MM_SHUFFLE_PS:
                return test_mm_shuffle_ps(f1, f2);
            case IT_MM_MOVEMASK_EPI8:
                return test_mm_movemask_epi8(i1);

            // ---- arithmetic ---------------------------------------------------------
            case IT_MM_SUB_PS:
                return test_mm_sub_ps(f1, f2);
            case IT_MM_SUB_EPI32:
                return test_mm_sub_epi32(i1, i2);
            case IT_MM_ADD_PS:
                return test_mm_add_ps(f1, f2);
            case IT_MM_ADD_EPI32:
                return test_mm_add_epi32(i1, i2);
            case IT_MM_MULLO_EPI16:
                return test_mm_mullo_epi16(s1, s2);
            case IT_MM_MUL_PS:
                return test_mm_mul_ps(f1, f2);
            case IT_MM_RCP_PS:
                return test_mm_rcp_ps(f1);
            case IT_MM_MAX_PS:
                return test_mm_max_ps(f1, f2);
            case IT_MM_MIN_PS:
                return test_mm_min_ps(f1, f2);
            case IT_MM_MIN_EPI16:
                return test_mm_min_epi16(s1, s2);
            case IT_MM_MULHI_EPI16:
                return test_mm_mulhi_epi16(s1, s2);

            // ---- packed comparisons -------------------------------------------------
            case IT_MM_CMPLT_PS:
                return test_mm_cmplt_ps(f1, f2);
            case IT_MM_CMPGT_PS:
                return test_mm_cmpgt_ps(f1, f2);
            case IT_MM_CMPGE_PS:
                return test_mm_cmpge_ps(f1, f2);
            case IT_MM_CMPLE_PS:
                return test_mm_cmple_ps(f1, f2);
            case IT_MM_CMPEQ_PS:
                return test_mm_cmpeq_ps(f1, f2);
            case IT_MM_CMPLT_EPI32:
                return test_mm_cmplt_epi32(i1, i2);
            case IT_MM_CMPGT_EPI32:
                return test_mm_cmpgt_epi32(i1, i2);

            // ---- conversions --------------------------------------------------------
            case IT_MM_CVTTPS_EPI32:
                return test_mm_cvttps_epi32(f1);
            case IT_MM_CVTEPI32_PS:
                return test_mm_cvtepi32_ps(i1);
            case IT_MM_CVTPS_EPI32:
                return test_mm_cvtps_epi32(f1);

            // ---- ordered / scalar comparisons ---------------------------------------
            case IT_MM_CMPORD_PS:
                return test_mm_cmpord_ps(f1, f2);
            case IT_MM_COMILT_SS:
                if (test_mm_comilt_ss(f1, f2))
                {
                    return true;
                }
                // Note to Alexander, you need to fix this.
                // The failing comparison is simply executed a second time and that result is
                // reported (presumably a hook for stepping through it in a debugger).
                return test_mm_comilt_ss(f1, f2);
            case IT_MM_COMIGT_SS:
                return test_mm_comigt_ss(f1, f2);
            case IT_MM_COMILE_SS:
                if (test_mm_comile_ss(f1, f2))
                {
                    return true;
                }
                // Note to Alexander, you need to fix this.
                return test_mm_comile_ss(f1, f2);
            case IT_MM_COMIGE_SS:
                return test_mm_comige_ss(f1, f2);
            case IT_MM_COMIEQ_SS:
                if (test_mm_comieq_ss(f1, f2))
                {
                    return true;
                }
                // Note to Alexander, you need to fix this.
                return test_mm_comieq_ss(f1, f2);
            case IT_MM_COMINEQ_SS:
                if (test_mm_comineq_ss(f1, f2))
                {
                    return true;
                }
                // Note to Alexander, you need to fix this.
                return test_mm_comineq_ss(f1, f2);

            // ---- placeholders: no helper is called, these always report success -----
            case IT_MM_HADD_PS:
            case IT_MM_MAX_EPI32:
            case IT_MM_MIN_EPI32:
            case IT_MM_MAX_SS:
            case IT_MM_MIN_SS:
            case IT_MM_SQRT_PS:
            case IT_MM_SQRT_SS:
            case IT_MM_RSQRT_PS:
            case IT_MM_DIV_PS:
            case IT_MM_DIV_SS:
            case IT_MM_MULLO_EPI32:
            case IT_MM_ADD_EPI16:
            case IT_MM_ADD_SS:
            case IT_MM_SHUFFLE_EPI32_DEFAULT:
            case IT_MM_SHUFFLE_EPI32_FUNCTION:
            case IT_MM_SHUFFLE_EPI32_SPLAT:
            case IT_MM_SHUFFLE_EPI32_SINGLE:
            case IT_MM_SHUFFLEHI_EPI16_FUNCTION:
            case IT_MM_XOR_SI128:
            case IT_MM_XOR_PS:
            case IT_MM_LOAD_PS:
            case IT_MM_LOADU_PS:
            case IT_MM_LOAD_SS:
            case IT_MM_CMPNEQ_PS:
            case IT_MM_STOREU_PS:
            case IT_MM_STORE_SI128:
            case IT_MM_STORE_SS:
            case IT_MM_STOREL_EPI64:
            case IT_MM_SETR_PS:
            case IT_MM_CVTSI128_SI32:
            case IT_MM_CVTSI32_SI128:
            case IT_MM_CASTPS_SI128:
            case IT_MM_CASTSI128_PS:
            case IT_MM_LOAD_SI128:
            case IT_MM_PACKS_EPI16:
            case IT_MM_PACKUS_EPI16:
            case IT_MM_PACKS_EPI32:
            case IT_MM_UNPACKLO_EPI8:
            case IT_MM_UNPACKLO_EPI16:
            case IT_MM_UNPACKLO_EPI32:
            case IT_MM_UNPACKLO_PS:
            case IT_MM_UNPACKHI_PS:
            case IT_MM_UNPACKHI_EPI8:
            case IT_MM_UNPACKHI_EPI16:
            case IT_MM_UNPACKHI_EPI32:
            case IT_MM_SFENCE:
            case IT_MM_STREAM_SI128:
            case IT_MM_CLFLUSH:
            case IT_MM_CVTSS_F32:
                return true;

            //added by hasindu
            // ---- 16-bit and 8-bit lane tests; the int buffers are reinterpreted -----
            case IT_MM_SET1_EPI16:
                return test_mm_set1_epi16(s1);
            case IT_MM_SET_EPI16:
                return test_mm_set_epi16(s1);
            case IT_MM_SLLI_EPI16:
                return test_mm_slli_epi16(s1);
            case IT_MM_SRLI_EPI16:
                return test_mm_srli_epi16(s1);
            case IT_MM_CMPEQ_EPI16:
                return test_mm_cmpeq_epi16(s1, s2);
            case IT_MM_SET1_EPI8:
                return test_mm_set1_epi8(b1);
            case IT_MM_ADDS_EPU8:
                return test_mm_adds_epu8(b1, b2);
            case IT_MM_SUBS_EPU8:
                return test_mm_subs_epu8(b1, b2);
            case IT_MM_MAX_EPU8:
                return test_mm_max_epu8(b1, b2);
            case IT_MM_CMPEQ_EPI8:
                return test_mm_cmpeq_epi8(b1, b2);
            case IT_MM_ADDS_EPI16:
                return test_mm_adds_epi16(s1, s2);
            case IT_MM_MAX_EPI16:
                return test_mm_max_epi16(s1, s2);
            case IT_MM_SUBS_EPU16:
                return test_mm_subs_epu16(s1, s2);
            case IT_MM_CMPGT_EPI16:
                return test_mm_cmpgt_epi16(s1, s2);
            case IT_MM_LOADU_SI128:
                return test_mm_loadu_si128(ci1);
            case IT_MM_STOREU_SI128:
                return test_mm_storeu_si128(ci1);
            case IT_MM_ADD_EPI8:
                return test_mm_add_epi8(b1, b2);
            case IT_MM_CMPGT_EPI8:
                return test_mm_cmpgt_epi8(b1, b2);
            case IT_MM_CMPLT_EPI8:
                return test_mm_cmplt_epi8(b1, b2);
            case IT_MM_SUB_EPI8:
                return test_mm_sub_epi8(b1, b2);
            case IT_MM_SETR_EPI32:
                return test_mm_setr_epi32(ci1);
            case IT_MM_MIN_EPU8:
                return test_mm_min_epu8(b1, b2);
        }

        // No case label matched: treated as a pass.
        return true;
    }


    // Runs one instruction test across the whole random input pool.
    //
    // For every window start i in [0, MAX_TEST_VALUE - 8) the scratch buffers are reloaded from
    // the pools, optionally perturbed (reciprocals, a forced-equal lane, NaNs, integer-valued or
    // halved floats, depending on the test and on rand()), and then passed to runSingleTest().
    //
    // Returns true when every iteration passes. Stops at the first failure -- including a failure
    // to load the scratch buffers -- and returns false.
    virtual bool runTest(InstructionTest test)
    {
        bool ret = true;


        // Test a whole bunch of values.
        // Each iteration reads pool entries i .. i+7 (four per scratch buffer), hence the "- 8".
        for (uint32_t i = 0; i < (MAX_TEST_VALUE - 8); i++)
        {
            ret = loadTestFloatPointers(i); // Load some random float values
            if ( !ret ) break; // loading the float scratch buffers failed
            ret = loadTestIntPointers(i);   // Load some random int values
            if ( !ret ) break; // loading the int scratch buffers failed
            // If we are testing the reciprocal, then invert the input data (easier for debugging)
            if ( test == IT_MM_RCP_PS )
            {

                mTestFloatPointer1[0] = 1.0f / mTestFloatPointer1[0];
                mTestFloatPointer1[1] = 1.0f / mTestFloatPointer1[1];
                mTestFloatPointer1[2] = 1.0f / mTestFloatPointer1[2];
                mTestFloatPointer1[3] = 1.0f / mTestFloatPointer1[3];
            }
            if ( test == IT_MM_CMPGE_PS || test == IT_MM_CMPLE_PS || test == IT_MM_CMPEQ_PS )
            {
               // Make sure at least one value is the same.
               mTestFloatPointer1[3] = mTestFloatPointer2[3];
            }

            if (test == IT_MM_CMPORD_PS || 
                test == IT_MM_COMILT_SS || 
                test == IT_MM_COMILE_SS ||
                test == IT_MM_COMIGE_SS ||
                test == IT_MM_COMIEQ_SS ||
                test == IT_MM_COMINEQ_SS ||
                test == IT_MM_COMIGT_SS) // if testing for NAN's make sure we have some nans
            {
                // One out of four times
                // Make sure a couple of values have NANs for testing purposes
                if ((rand() & 3) == 0)
                {
                    uint32_t r1 = rand() & 3;
                    uint32_t r2 = rand() & 3;
                    mTestFloatPointer1[r1] = getNAN();
                    mTestFloatPointer2[r2] = getNAN();
                }
            }

            // one out of every random 64 times or so, mix up the test floats to contain some integer values
            if ((rand() & 63) == 0)
            {
                uint32_t option = rand() & 3;
                switch (option)
                {
                    // All integers..
                    case 0:
                        mTestFloatPointer1[0] = float(mTestIntPointer1[0]);
                        mTestFloatPointer1[1] = float(mTestIntPointer1[1]);
                        mTestFloatPointer1[2] = float(mTestIntPointer1[2]);
                        mTestFloatPointer1[3] = float(mTestIntPointer1[3]);

                        mTestFloatPointer2[0] = float(mTestIntPointer2[0]);
                        mTestFloatPointer2[1] = float(mTestIntPointer2[1]);
                        mTestFloatPointer2[2] = float(mTestIntPointer2[2]);
                        mTestFloatPointer2[3] = float(mTestIntPointer2[3]);

                        break;
                    // One random lane of each buffer becomes an integer value.
                    case 1:
                        {
                            uint32_t index = rand() & 3;
                            mTestFloatPointer1[index] = float(mTestIntPointer1[index]);
                            index = rand() & 3;
                            mTestFloatPointer2[index] = float(mTestIntPointer2[index]);
                        }
                        break;
                    // Up to two random lanes of each buffer become integer values.
                    case 2:
                        {
                            uint32_t index1 = rand() & 3;
                            uint32_t index2 = rand() & 3;
                            mTestFloatPointer1[index1] = float(mTestIntPointer1[index1]);
                            mTestFloatPointer1[index2] = float(mTestIntPointer1[index2]);
                            index1 = rand() & 3;
                            index2 = rand() & 3;
                            // The second pair targets buffer 2. It used to repeat buffer 1
                            // (copy-paste slip), so this option never touched buffer 2.
                            mTestFloatPointer2[index1] = float(mTestIntPointer2[index1]);
                            mTestFloatPointer2[index2] = float(mTestIntPointer2[index2]);
                        }
                        break;
                    // Only buffer 1 becomes all integers.
                    case 3:
                        mTestFloatPointer1[0] = float(mTestIntPointer1[0]);
                        mTestFloatPointer1[1] = float(mTestIntPointer1[1]);
                        mTestFloatPointer1[2] = float(mTestIntPointer1[2]);
                        mTestFloatPointer1[3] = float(mTestIntPointer1[3]);
                        break;
                }
                if ((rand() & 3) == 0) // one out of 4 times, make halves
                {
                    for (uint32_t j = 0; j < 4; j++)
                    {
                        mTestFloatPointer1[j] *= 0.5f;
                        mTestFloatPointer2[j] *= 0.5f;
                    }
                }
            }
#if 0
            {
                mTestFloatPointer1[0] = getNAN();
                mTestFloatPointer2[0] = getNAN();
                bool ok = test_mm_comilt_ss(mTestFloatPointer1, mTestFloatPointer1);
                if (!ok)
                {
                    printf("Debug me");
                }
            }
#endif
            ret = runSingleTest(test,i);
            if ( !ret ) // the test failed...
            {
                // Set a breakpoint here if you want to step through the failure case in the debugger.
                // The result of this re-run is deliberately discarded so that it can never turn
                // the failure recorded in 'ret' back into a pass.
                (void)runSingleTest(test,i);
                break;
            }
        }
        return ret;
    }

    // Destroys this test object. Instances are created with 'new' in SSE2NEONTest::create(),
    // so the object must not be used after release() returns.
    virtual void release(void)
    {
        delete this;
    }

    // Scratch buffers handed to the individual test_* helpers. Each one is allocated in the
    // constructor with platformAlignedAlloc, sized to one __m128 / __m128i, and freed in the
    // destructor.
    float       *mTestFloatPointer1;
    float       *mTestFloatPointer2;
    int32_t     *mTestIntPointer1;
    int32_t     *mTestIntPointer2;
    // Input pools filled once in the constructor; runTest() copies an 8-value window of each
    // into the scratch buffers on every iteration.
    float       mTestFloats[MAX_TEST_VALUE];
    int32_t     mTestInts[MAX_TEST_VALUE];
};

// Factory for the test harness: heap-allocates the concrete implementation and hands it back
// through the SSE2NEONTest interface. The matching teardown is release(), which deletes it.
SSE2NEONTest *SSE2NEONTest::create(void)
{
    // The derived-to-base pointer conversion is implicit.
    return new SSE2NEONTestImpl;
}

} // end of SSE2NEON namespace
